This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
# Load the crash data. Text columns are converted to factors explicitly: the
# releveling loop further down calls relevel(), which only accepts factors, and
# R >= 4.0.0 no longer makes that conversion by default in read.csv().
dat <- read.csv("2019-06-13-exam-pa-data-file.csv", stringsAsFactors = TRUE)
summary(dat)
## Crash_Score year Month Time_of_Day
## Min. : 0.010 Min. :2014 Min. : 1.00 Min. :1.000
## 1st Qu.: 3.540 1st Qu.:2015 1st Qu.: 3.00 1st Qu.:3.000
## Median : 5.660 Median :2016 Median : 7.00 Median :4.000
## Mean : 6.567 Mean :2016 Mean : 6.56 Mean :4.034
## 3rd Qu.: 8.600 3rd Qu.:2017 3rd Qu.:10.00 3rd Qu.:5.000
## Max. :53.070 Max. :2019 Max. :12.00 Max. :6.000
##
## Rd_Feature Rd_Character Rd_Class
## DRIVEWAY : 2373 CURVE-GRADE : 643 OTHER : 9960
## INTERSECTION: 6702 CURVE-LEVEL : 725 STATE HWY:10603
## NONE :13025 CURVE-OTHER : 239 US HWY : 2574
## OTHER : 259 OTHER : 13
## RAMP : 778 STRAIGHT-GRADE: 2622
## STRAIGHT-LEVEL:18215
## STRAIGHT-OTHER: 680
## Rd_Configuration Rd_Surface
## ONE-WAY : 1496 COARSE ASPHALT : 1997
## TWO-WAY-NO-MEDIAN :12076 CONCRETE : 692
## TWO-WAY-PROTECTED-MEDIAN : 2627 GROOVED CONCRETE: 371
## TWO-WAY-UNPROTECTED-MEDIAN: 6882 OTHER : 70
## UNKNOWN : 56 SMOOTH ASPHALT :20007
##
##
## Rd_Conditions Light Weather
## DRY :19262 DARK-LIT : 3219 CLEAR :17393
## ICE-SNOW-SLUSH: 322 DARK-NOT-LIT: 708 CLOUDY: 3234
## OTHER : 134 DAWN : 140 OTHER : 85
## WET : 3419 DAYLIGHT :18262 RAIN : 2230
## DUSK : 602 SNOW : 195
## OTHER : 206
##
## Traffic_Control Work_Area
## NONE :14028 NO :22823
## OTHER : 228 YES: 314
## SIGNAL : 6352
## STOP-SIGN: 2269
## YIELD : 260
##
##
# Make the most frequent level of every categorical column its reference
# (first) level. Columns are chosen by type instead of by position (5:14), so
# the loop does not silently skip or hit the wrong columns if the file layout
# changes, and no local is named after base::table() or base::max().
is_categorical <- vapply(
  dat,
  function(col) is.factor(col) || is.character(col),
  logical(1)
)
vars <- colnames(dat)[is_categorical]
for (i in vars) {
  column <- as.factor(dat[[i]])
  level_counts <- table(column)
  # which.max() returns the first maximum, so ties go to the earlier level.
  most_frequent <- names(level_counts)[which.max(level_counts)]
  dat[[i]] <- relevel(column, ref = most_frequent)
}
summary(dat)
## Crash_Score year Month Time_of_Day
## Min. : 0.010 Min. :2014 Min. : 1.00 Min. :1.000
## 1st Qu.: 3.540 1st Qu.:2015 1st Qu.: 3.00 1st Qu.:3.000
## Median : 5.660 Median :2016 Median : 7.00 Median :4.000
## Mean : 6.567 Mean :2016 Mean : 6.56 Mean :4.034
## 3rd Qu.: 8.600 3rd Qu.:2017 3rd Qu.:10.00 3rd Qu.:5.000
## Max. :53.070 Max. :2019 Max. :12.00 Max. :6.000
##
## Rd_Feature Rd_Character Rd_Class
## NONE :13025 STRAIGHT-LEVEL:18215 STATE HWY:10603
## DRIVEWAY : 2373 CURVE-GRADE : 643 OTHER : 9960
## INTERSECTION: 6702 CURVE-LEVEL : 725 US HWY : 2574
## OTHER : 259 CURVE-OTHER : 239
## RAMP : 778 OTHER : 13
## STRAIGHT-GRADE: 2622
## STRAIGHT-OTHER: 680
## Rd_Configuration Rd_Surface
## TWO-WAY-NO-MEDIAN :12076 SMOOTH ASPHALT :20007
## ONE-WAY : 1496 COARSE ASPHALT : 1997
## TWO-WAY-PROTECTED-MEDIAN : 2627 CONCRETE : 692
## TWO-WAY-UNPROTECTED-MEDIAN: 6882 GROOVED CONCRETE: 371
## UNKNOWN : 56 OTHER : 70
##
##
## Rd_Conditions Light Weather
## DRY :19262 DAYLIGHT :18262 CLEAR :17393
## ICE-SNOW-SLUSH: 322 DARK-LIT : 3219 CLOUDY: 3234
## OTHER : 134 DARK-NOT-LIT: 708 OTHER : 85
## WET : 3419 DAWN : 140 RAIN : 2230
## DUSK : 602 SNOW : 195
## OTHER : 206
##
## Traffic_Control Work_Area
## NONE :14028 NO :22823
## OTHER : 228 YES: 314
## SIGNAL : 6352
## STOP-SIGN: 2269
## YIELD : 260
##
##
Task 1
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
# Histogram of the target variable using geom_histogram()'s default binning.
p <- ggplot(data = dat, mapping = aes(x = Crash_Score)) +
  geom_histogram()
p
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Box plot of Crash_Score against every other column, each treated as a
# categorical variable. The column is looked up through the `.data` pronoun
# rather than by indexing `dat` inside aes(), so the mapping always refers to
# the data handed to ggplot(). The plot object is no longer called `plot`,
# which shadowed base::plot().
vars <- colnames(dat)[colnames(dat) != "Crash_Score"]
for (i in vars) {
  box_plot <- ggplot(dat, aes(x = as.factor(.data[[i]]), y = Crash_Score)) +
    geom_boxplot() +
    labs(x = i)
  print(box_plot)
}
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Mean, median and row count of Crash_Score for each value of every column in
# `vars`. group_by_() is deprecated, so the column name held in `i` is
# converted to a symbol and unquoted into group_by() instead.
for (i in vars) {
  print(i)
  level_summary <- dat %>%
    group_by(!!rlang::sym(i)) %>%
    summarise(
      mean = mean(Crash_Score),
      median = median(Crash_Score),
      n = n()
    )
  print(level_summary)
}
## [1] "year"
## Warning: group_by_() is deprecated.
## Please use group_by() instead
##
## The 'programming' vignette or the tidyeval book can help you
## to program with group_by() : https://tidyeval.tidyverse.org
## This warning is displayed once per session.
## # A tibble: 6 x 4
## year mean median n
## <int> <dbl> <dbl> <int>
## 1 2014 6.62 5.77 3928
## 2 2015 6.53 5.58 4402
## 3 2016 6.62 5.68 4651
## 4 2017 6.58 5.66 4558
## 5 2018 6.52 5.63 4833
## 6 2019 6.42 5.6 765
## [1] "Month"
## # A tibble: 12 x 4
## Month mean median n
## <int> <dbl> <dbl> <int>
## 1 1 6.56 5.76 2052
## 2 2 6.67 5.66 1947
## 3 3 6.54 5.67 1887
## 4 4 6.61 5.68 1705
## 5 5 6.52 5.71 2013
## 6 6 6.52 5.53 1864
## 7 7 6.51 5.65 1671
## 8 8 6.53 5.7 1926
## 9 9 6.54 5.65 1843
## 10 10 6.65 5.64 2186
## 11 11 6.48 5.57 1972
## 12 12 6.66 5.65 2071
## [1] "Time_of_Day"
## # A tibble: 6 x 4
## Time_of_Day mean median n
## <int> <dbl> <dbl> <int>
## 1 1 5.87 4.86 808
## 2 2 6.38 5.44 1627
## 3 3 6.63 5.7 4827
## 4 4 6.55 5.71 6939
## 5 5 6.74 5.79 6579
## 6 6 6.38 5.51 2357
## [1] "Rd_Feature"
## # A tibble: 5 x 4
## Rd_Feature mean median n
## <fct> <dbl> <dbl> <int>
## 1 NONE 6.38 5.45 13025
## 2 DRIVEWAY 6.23 5.52 2373
## 3 INTERSECTION 7.08 6.2 6702
## 4 OTHER 6.35 5.25 259
## 5 RAMP 6.29 5.50 778
## [1] "Rd_Character"
## # A tibble: 7 x 4
## Rd_Character mean median n
## <fct> <dbl> <dbl> <int>
## 1 STRAIGHT-LEVEL 6.60 5.67 18215
## 2 CURVE-GRADE 6.09 5.06 643
## 3 CURVE-LEVEL 6.15 5.06 725
## 4 CURVE-OTHER 6.78 5.44 239
## 5 OTHER 5.66 4.16 13
## 6 STRAIGHT-GRADE 6.61 5.86 2622
## 7 STRAIGHT-OTHER 6.41 5.63 680
## [1] "Rd_Class"
## # A tibble: 3 x 4
## Rd_Class mean median n
## <fct> <dbl> <dbl> <int>
## 1 STATE HWY 6.90 5.98 10603
## 2 OTHER 6.15 5.35 9960
## 3 US HWY 6.79 5.65 2574
## [1] "Rd_Configuration"
## # A tibble: 5 x 4
## Rd_Configuration mean median n
## <fct> <dbl> <dbl> <int>
## 1 TWO-WAY-NO-MEDIAN 6.40 5.53 12076
## 2 ONE-WAY 6.26 5.38 1496
## 3 TWO-WAY-PROTECTED-MEDIAN 6.82 5.76 2627
## 4 TWO-WAY-UNPROTECTED-MEDIAN 6.84 5.96 6882
## 5 UNKNOWN 6.30 5.46 56
## [1] "Rd_Surface"
## # A tibble: 5 x 4
## Rd_Surface mean median n
## <fct> <dbl> <dbl> <int>
## 1 SMOOTH ASPHALT 6.57 5.67 20007
## 2 COARSE ASPHALT 6.64 5.75 1997
## 3 CONCRETE 6.14 5.11 692
## 4 GROOVED CONCRETE 6.73 5.8 371
## 5 OTHER 5.99 4.97 70
## [1] "Rd_Conditions"
## # A tibble: 4 x 4
## Rd_Conditions mean median n
## <fct> <dbl> <dbl> <int>
## 1 DRY 6.57 5.66 19262
## 2 ICE-SNOW-SLUSH 6.59 5.87 322
## 3 OTHER 6.23 5.63 134
## 4 WET 6.54 5.62 3419
## [1] "Light"
## # A tibble: 6 x 4
## Light mean median n
## <fct> <dbl> <dbl> <int>
## 1 DAYLIGHT 6.66 5.75 18262
## 2 DARK-LIT 6.24 5.4 3219
## 3 DARK-NOT-LIT 6.00 4.76 708
## 4 DAWN 6.32 5.44 140
## 5 DUSK 6.70 5.84 602
## 6 OTHER 5.36 4.33 206
## [1] "Weather"
## # A tibble: 5 x 4
## Weather mean median n
## <fct> <dbl> <dbl> <int>
## 1 CLEAR 6.57 5.65 17393
## 2 CLOUDY 6.52 5.68 3234
## 3 OTHER 5.65 4.84 85
## 4 RAIN 6.65 5.68 2230
## 5 SNOW 6.71 5.98 195
## [1] "Traffic_Control"
## # A tibble: 5 x 4
## Traffic_Control mean median n
## <fct> <dbl> <dbl> <int>
## 1 NONE 6.29 5.4 14028
## 2 OTHER 6.65 5.61 228
## 3 SIGNAL 7.04 6.14 6352
## 4 STOP-SIGN 6.95 6.14 2269
## 5 YIELD 6.60 5.56 260
## [1] "Work_Area"
## # A tibble: 2 x 4
## Work_Area mean median n
## <fct> <dbl> <dbl> <int>
## 1 NO 6.56 5.65 22823
## 2 YES 7.19 5.96 314
Task 2
# Bar chart of record counts for every column except the target. The column is
# referenced through the `.data` pronoun instead of indexing `dat` inside
# aes(), and the plot object no longer shadows base::plot().
vars <- colnames(dat)[colnames(dat) != "Crash_Score"]
for (i in vars) {
  bar_plot <- ggplot(dat, aes(x = .data[[i]])) +
    geom_bar() +
    labs(x = i) +
    # Rotate the level labels so long names do not overlap.
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  print(bar_plot)
}
# Work on a copy so the original levels stay available in `dat`.
dat2 <- dat
library(plyr)
# Collapse Traffic_Control into OTHER vs SIGNAL-STOP. Every original level is
# named explicitly, so the recoding no longer depends on the order in which the
# factor levels happen to be stored.
var <- "Traffic_Control"
dat2[[var]] <- mapvalues(
  as.factor(dat2[[var]]),
  from = c("NONE", "OTHER", "SIGNAL", "STOP-SIGN", "YIELD"),
  to = c("OTHER", "OTHER", "SIGNAL-STOP", "SIGNAL-STOP", "OTHER")
)
# Make the most frequent combined level the reference level.
level_counts <- table(dat2[[var]])
most_frequent <- names(level_counts)[which.max(level_counts)]
dat2[[var]] <- relevel(dat2[[var]], ref = most_frequent)
table(dat2[[var]])
##
## OTHER SIGNAL-STOP
## 14516 8621
# Collapse Rd_Character into STRAIGHT vs CURVE. Original levels are matched by
# name, so the result does not depend on the current order of the levels.
# NOTE(review): "OTHER" is grouped with CURVE, exactly as the positional
# mapping this replaces did -- confirm that grouping is intended.
var <- "Rd_Character"
dat2[[var]] <- mapvalues(
  as.factor(dat2[[var]]),
  from = c(
    "STRAIGHT-LEVEL", "CURVE-GRADE", "CURVE-LEVEL", "CURVE-OTHER",
    "OTHER", "STRAIGHT-GRADE", "STRAIGHT-OTHER"
  ),
  to = c(
    "STRAIGHT", "CURVE", "CURVE", "CURVE",
    "CURVE", "STRAIGHT", "STRAIGHT"
  )
)
# Make the most frequent combined level the reference level.
level_counts <- table(dat2[[var]])
most_frequent <- names(level_counts)[which.max(level_counts)]
dat2[[var]] <- relevel(dat2[[var]], ref = most_frequent)
table(dat2[[var]])
##
## STRAIGHT CURVE
## 21517 1620
# Collapse Rd_Feature into INTERSECTION vs OTHER. Original levels are matched
# by name, so the result does not depend on the current order of the levels.
var <- "Rd_Feature"
dat2[[var]] <- mapvalues(
  as.factor(dat2[[var]]),
  from = c("NONE", "DRIVEWAY", "INTERSECTION", "OTHER", "RAMP"),
  to = c("OTHER", "OTHER", "INTERSECTION", "OTHER", "OTHER")
)
# Make the most frequent combined level the reference level.
level_counts <- table(dat2[[var]])
most_frequent <- names(level_counts)[which.max(level_counts)]
dat2[[var]] <- relevel(dat2[[var]], ref = most_frequent)
table(dat2[[var]])
##
## OTHER INTERSECTION
## 16435 6702
# Convert the integer Time_of_Day code (1-6) to a factor and merge codes 3-5
# into DAYTIME. Codes are matched by value, so the recoding does not rely on
# the order of the factor levels.
var <- "Time_of_Day"
dat2[[var]] <- mapvalues(
  as.factor(dat2[[var]]),
  from = c("1", "2", "3", "4", "5", "6"),
  to = c(
    "OVERNIGHT", "LATE-EARLY", "DAYTIME",
    "DAYTIME", "DAYTIME", "LATE-NIGHT"
  )
)
# Make the most frequent combined level the reference level.
level_counts <- table(dat2[[var]])
most_frequent <- names(level_counts)[which.max(level_counts)]
dat2[[var]] <- relevel(dat2[[var]], ref = most_frequent)
table(dat2[[var]])
##
## DAYTIME OVERNIGHT LATE-EARLY LATE-NIGHT
## 18345 808 1627 2357
# Collapse Rd_Surface into ASPHALT vs OTHER. Original levels are matched by
# name, so the result does not depend on the current order of the levels.
var <- "Rd_Surface"
dat2[[var]] <- mapvalues(
  as.factor(dat2[[var]]),
  from = c(
    "SMOOTH ASPHALT", "COARSE ASPHALT", "CONCRETE",
    "GROOVED CONCRETE", "OTHER"
  ),
  to = c("ASPHALT", "ASPHALT", "OTHER", "OTHER", "OTHER")
)
# Make the most frequent combined level the reference level.
level_counts <- table(dat2[[var]])
most_frequent <- names(level_counts)[which.max(level_counts)]
dat2[[var]] <- relevel(dat2[[var]], ref = most_frequent)
table(dat2[[var]])
##
## ASPHALT OTHER
## 22004 1133
Task 3
# PCA inputs: the three condition-related factors, taken from `dat`.
datPCA <- dat[, c("Rd_Conditions", "Light", "Weather")]
library(caret)
## Loading required package: lattice
varsPCA <- colnames(datPCA)
# Store every selected column as character before dummy coding.
datPCA[varsPCA] <- lapply(datPCA[varsPCA], as.character)
# One indicator column per level (fullRank = FALSE keeps every level).
dummy_formula <- paste("~", paste(varsPCA, collapse = "+"))
binarizer <- caret::dummyVars(dummy_formula, data = datPCA, fullRank = FALSE)
datPCAbin <- data.frame(predict(binarizer, newdata = datPCA))
head(datPCAbin)
# Principal components of the centered and scaled indicator columns.
PCAweather <- prcomp(datPCAbin, center = TRUE, scale. = TRUE)
summary(PCAweather)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.829 1.3740 1.2796 1.2379 1.14429 1.03216 1.01236
## Proportion of Variance 0.223 0.1259 0.1092 0.1022 0.08729 0.07102 0.06833
## Cumulative Proportion 0.223 0.3489 0.4580 0.5602 0.64748 0.71851 0.78683
## PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 1.0033 0.9174 0.79731 0.64583 0.54470 5.436e-15
## Proportion of Variance 0.0671 0.0561 0.04238 0.02781 0.01978 0.000e+00
## Cumulative Proportion 0.8539 0.9100 0.95241 0.98022 1.00000 1.000e+00
## PC14 PC15
## Standard deviation 2.2e-15 1.729e-15
## Proportion of Variance 0.0e+00 0.000e+00
## Cumulative Proportion 1.0e+00 1.000e+00
# Print the loading of every indicator column on each principal component.
PCAweather$rotation
## PC1 PC2 PC3
## Rd_ConditionsDRY -0.51165971 0.03279495 -0.074984796
## Rd_ConditionsICE.SNOW.SLUSH 0.09037524 0.08506534 0.662448145
## Rd_ConditionsOTHER 0.05610221 0.18320852 0.103092721
## Rd_ConditionsWET 0.49654749 -0.10176327 -0.161823749
## LightDARK.LIT 0.11584644 0.52794265 -0.134963861
## LightDARK.NOT.LIT 0.05371675 0.19840327 -0.012771256
## LightDAWN 0.03037488 0.07312351 0.008834873
## LightDAYLIGHT -0.14979749 -0.66027088 0.122825366
## LightDUSK 0.04011811 0.17211754 -0.069299885
## LightOTHER 0.03196240 0.20556965 0.097572239
## WeatherCLEAR -0.45856690 0.18940018 -0.043504511
## WeatherCLOUDY 0.16796308 -0.22634633 0.028404961
## WeatherOTHER 0.05593982 0.14313571 0.095611440
## WeatherRAIN 0.43250589 -0.06252514 -0.190678603
## WeatherSNOW 0.09667013 0.07063727 0.650123103
## PC4 PC5 PC6
## Rd_ConditionsDRY 0.01484130 -0.18123640 -0.000304546
## Rd_ConditionsICE.SNOW.SLUSH 0.17309970 0.03263430 -0.003229391
## Rd_ConditionsOTHER -0.59102357 0.08185876 0.013871952
## Rd_ConditionsWET 0.05361910 0.16242361 -0.001579706
## LightDARK.LIT 0.20553848 -0.15082249 0.524023896
## LightDARK.NOT.LIT 0.08325274 -0.02460026 -0.666073761
## LightDAWN 0.04478967 -0.08625994 -0.148703516
## LightDAYLIGHT -0.11844823 0.19122929 0.064770906
## LightDUSK 0.03224585 -0.14753349 -0.501319635
## LightOTHER -0.48728603 0.09181326 -0.018210396
## WeatherCLEAR 0.08594496 0.35125976 -0.020935930
## WeatherCLOUDY -0.14707914 -0.76095953 0.031641146
## WeatherOTHER -0.49208702 0.06877513 0.035509159
## WeatherRAIN 0.09025035 0.35702863 -0.020451317
## WeatherSNOW 0.18605948 0.02838991 0.021438779
## PC7 PC8 PC9
## Rd_ConditionsDRY -0.015248814 -0.007849658 -0.003938895
## Rd_ConditionsICE.SNOW.SLUSH 0.028817357 -0.020239216 -0.011409533
## Rd_ConditionsOTHER -0.015819945 0.004441833 0.003261074
## Rd_ConditionsWET 0.009914945 0.013990907 0.007213656
## LightDARK.LIT -0.050731848 -0.067682247 -0.062223343
## LightDARK.NOT.LIT -0.624270732 -0.165555370 -0.071559483
## LightDAWN 0.005078405 0.969593680 -0.066722229
## LightDAYLIGHT 0.007612700 -0.009937129 -0.049517135
## LightDUSK 0.771261686 -0.147469720 -0.029538956
## LightOTHER -0.012772847 0.045476572 0.680552997
## WeatherCLEAR 0.027257542 0.030792065 0.019213521
## WeatherCLOUDY -0.080936296 -0.038591035 0.063032700
## WeatherOTHER 0.018306649 -0.019886896 -0.716703836
## WeatherRAIN 0.036838111 0.010225294 0.039016123
## WeatherSNOW 0.047153728 -0.018980562 0.018478936
## PC10 PC11 PC12
## Rd_ConditionsDRY -0.10561246 -0.174404093 0.408737115
## Rd_ConditionsICE.SNOW.SLUSH -0.02298359 0.667192980 0.139630559
## Rd_ConditionsOTHER 0.75494351 -0.019528178 0.075105047
## Rd_ConditionsWET -0.04271335 -0.032559601 -0.492233199
## LightDARK.LIT 0.04761367 0.005227977 0.014691286
## LightDARK.NOT.LIT 0.01708870 -0.022654370 0.002144386
## LightDAWN 0.01903033 -0.004732147 0.014857170
## LightDAYLIGHT 0.04151241 0.006230309 -0.009865851
## LightDUSK 0.03318236 0.002382120 -0.008017564
## LightOTHER -0.45890809 -0.004901907 -0.013905055
## WeatherCLEAR 0.06431667 0.122166890 -0.374060795
## WeatherCLOUDY 0.01221875 0.071474866 -0.067919098
## WeatherOTHER -0.44168092 -0.023803584 -0.006415880
## WeatherRAIN -0.02332117 -0.038841641 0.645813166
## WeatherSNOW 0.01730821 -0.707287146 -0.055286182
## PC13 PC14 PC15
## Rd_ConditionsDRY 0.39278871 -0.13988638 0.56181979
## Rd_ConditionsICE.SNOW.SLUSH 0.12322835 -0.04388611 0.17625793
## Rd_ConditionsOTHER 0.07982099 -0.02842716 0.11417082
## Rd_ConditionsWET 0.37329609 -0.13294435 0.53393879
## LightDARK.LIT 0.08964679 -0.53831132 -0.19670839
## LightDARK.NOT.LIT 0.04461418 -0.26789940 -0.09789514
## LightDAWN 0.02008866 -0.12062846 -0.04407975
## LightDAYLIGHT 0.10563622 -0.63432472 -0.23179337
## LightDUSK 0.04123618 -0.24761513 -0.09048291
## LightOTHER 0.02433305 -0.14611516 -0.05339304
## WeatherCLEAR 0.55120660 0.21321542 -0.33228056
## WeatherCLOUDY 0.44243475 0.17114075 -0.26671028
## WeatherOTHER 0.07719403 0.02985987 -0.04653441
## WeatherRAIN 0.37654617 0.14565401 -0.22699107
## WeatherSNOW 0.11664144 0.04511875 -0.07031426
# Center and scale the indicator columns, matching the prcomp() settings.
datPCAbin.std <- as.data.frame(scale(datPCAbin))
#dat2<-dat
# WETorDRY approximates PC1 from its four largest loadings in
# PCAweather$rotation: Rd_ConditionsDRY (-0.51), Rd_ConditionsWET (+0.50),
# WeatherCLEAR (-0.46) and WeatherRAIN (+0.43). The DRY term used to be added
# with a positive sign, contradicting its negative loading and largely
# cancelling the WET term; with the sign corrected, larger values consistently
# mean wetter conditions. Outputs recorded further down that involve WETorDRY
# were produced before this correction.
dat2$WETorDRY <- -0.51 * datPCAbin.std$Rd_ConditionsDRY +
  0.5 * datPCAbin.std$Rd_ConditionsWET -
  0.46 * datPCAbin.std$WeatherCLEAR +
  0.43 * datPCAbin.std$WeatherRAIN
summary(dat2$WETorDRY)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.7500 -0.3842 -0.3842 0.0000 -0.3842 2.1807
# Continue with the engineered data set and remove the two raw columns whose
# indicators feed the WETorDRY feature.
dat <- dat2
dat[c("Rd_Conditions", "Weather")] <- NULL
summary(dat)
## Crash_Score year Month Time_of_Day
## Min. : 0.010 Min. :2014 Min. : 1.00 DAYTIME :18345
## 1st Qu.: 3.540 1st Qu.:2015 1st Qu.: 3.00 OVERNIGHT : 808
## Median : 5.660 Median :2016 Median : 7.00 LATE-EARLY: 1627
## Mean : 6.567 Mean :2016 Mean : 6.56 LATE-NIGHT: 2357
## 3rd Qu.: 8.600 3rd Qu.:2017 3rd Qu.:10.00
## Max. :53.070 Max. :2019 Max. :12.00
## Rd_Feature Rd_Character Rd_Class
## OTHER :16435 STRAIGHT:21517 STATE HWY:10603
## INTERSECTION: 6702 CURVE : 1620 OTHER : 9960
## US HWY : 2574
##
##
##
## Rd_Configuration Rd_Surface Light
## TWO-WAY-NO-MEDIAN :12076 ASPHALT:22004 DAYLIGHT :18262
## ONE-WAY : 1496 OTHER : 1133 DARK-LIT : 3219
## TWO-WAY-PROTECTED-MEDIAN : 2627 DARK-NOT-LIT: 708
## TWO-WAY-UNPROTECTED-MEDIAN: 6882 DAWN : 140
## UNKNOWN : 56 DUSK : 602
## OTHER : 206
## Traffic_Control Work_Area WETorDRY
## OTHER :14516 NO :22823 Min. :-1.7500
## SIGNAL-STOP: 8621 YES: 314 1st Qu.:-0.3842
## Median :-0.3842
## Mean : 0.0000
## 3rd Qu.:-0.3842
## Max. : 2.1807
Task 4
# Box plots of log(Crash_Score) split by a second factor, one panel per level
# of the x variable. `scales` is now spelled out in full; the previous
# `scale =` only worked through partial argument matching.
ggplot(dat, aes(x = Rd_Character, y = log(Crash_Score), fill = Rd_Class)) +
  geom_boxplot() +
  facet_wrap(~Rd_Character, scales = "free")
ggplot(dat, aes(x = Traffic_Control, y = log(Crash_Score), fill = Rd_Feature)) +
  geom_boxplot() +
  facet_wrap(~Traffic_Control, scales = "free")
Task 5
# Store Month as a factor so it is handled as categorical, then list its levels.
dat[["Month"]] <- factor(dat[["Month"]])
levels(dat[["Month"]])
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
library(caret)
# Fixed seed so the 75/25 split can be reproduced.
set.seed(1234)
partition <- createDataPartition(y = dat$Crash_Score, p = 0.75, list = FALSE)
train <- dat[partition, ]
test <- dat[-partition, ]
# Mean target value in each partition, printed for comparison.
print("TRAIN")
## [1] "TRAIN"
mean(train[["Crash_Score"]])
## [1] 6.561156
print("TEST")
## [1] "TEST"
mean(test[["Crash_Score"]])
## [1] 6.58402
# Gaussian GLM of Crash_Score on every other column of the training data.
GLMols <- glm(
  formula = Crash_Score ~ .,
  family = gaussian(),
  data = train
)
summary(GLMols)
##
## Call:
## glm(formula = Crash_Score ~ ., family = gaussian(), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.119 -2.970 -0.876 2.011 47.205
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 103.378410 44.597458 2.318
## year -0.047954 0.022115 -2.168
## Month2 0.228312 0.154849 1.474
## Month3 0.110909 0.156551 0.708
## Month4 0.003026 0.161885 0.019
## Month5 0.002956 0.153889 0.019
## Month6 -0.127018 0.158568 -0.801
## Month7 -0.100649 0.161562 -0.623
## Month8 -0.058145 0.155699 -0.373
## Month9 0.004877 0.158549 0.031
## Month10 0.167344 0.151545 1.104
## Month11 -0.023279 0.155254 -0.150
## Month12 0.331391 0.153274 2.162
## Time_of_DayOVERNIGHT -0.535530 0.190537 -2.811
## Time_of_DayLATE-EARLY -0.160209 0.130249 -1.230
## Time_of_DayLATE-NIGHT -0.096836 0.125186 -0.774
## Rd_FeatureINTERSECTION 0.335777 0.093745 3.582
## Rd_CharacterCURVE -0.210142 0.128721 -1.633
## Rd_ClassOTHER -0.543359 0.080441 -6.755
## Rd_ClassUS HWY 0.110509 0.131862 0.838
## Rd_ConfigurationONE-WAY -0.134412 0.141906 -0.947
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN 0.089917 0.129186 0.696
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN -0.025951 0.084510 -0.307
## Rd_ConfigurationUNKNOWN 0.395609 0.635595 0.622
## Rd_SurfaceOTHER -0.264032 0.161973 -1.630
## LightDARK-LIT -0.317882 0.113729 -2.795
## LightDARK-NOT-LIT -0.666567 0.200865 -3.318
## LightDAWN -0.175518 0.435735 -0.403
## LightDUSK -0.183188 0.208614 -0.878
## LightOTHER -0.734078 0.348821 -2.104
## Traffic_ControlSIGNAL-STOP 0.235240 0.088518 2.658
## Work_AreaYES 0.317833 0.278329 1.142
## WETorDRY -0.005966 0.040202 -0.148
## Pr(>|t|)
## (Intercept) 0.020459 *
## year 0.030144 *
## Month2 0.140387
## Month3 0.478672
## Month4 0.985085
## Month5 0.984675
## Month6 0.423124
## Month7 0.533308
## Month8 0.708821
## Month9 0.975462
## Month10 0.269498
## Month11 0.880814
## Month12 0.030625 *
## Time_of_DayOVERNIGHT 0.004950 **
## Time_of_DayLATE-EARLY 0.218705
## Time_of_DayLATE-NIGHT 0.439218
## Rd_FeatureINTERSECTION 0.000342 ***
## Rd_CharacterCURVE 0.102585
## Rd_ClassOTHER 1.48e-11 ***
## Rd_ClassUS HWY 0.402009
## Rd_ConfigurationONE-WAY 0.343556
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN 0.486420
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN 0.758790
## Rd_ConfigurationUNKNOWN 0.533672
## Rd_SurfaceOTHER 0.103099
## LightDARK-LIT 0.005194 **
## LightDARK-NOT-LIT 0.000907 ***
## LightDAWN 0.687094
## LightDUSK 0.379890
## LightOTHER 0.035353 *
## Traffic_ControlSIGNAL-STOP 0.007879 **
## Work_AreaYES 0.253498
## WETorDRY 0.882018
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 17.97454)
##
## Null deviance: 315875 on 17353 degrees of freedom
## Residual deviance: 311337 on 17321 degrees of freedom
## AIC: 99418
##
## Number of Fisher Scoring iterations: 2
print("AIC")
## [1] "AIC"
AIC(GLMols)
## [1] 99418.44
# Test-set predictions get their own name; assigning them to `predict`
# shadowed the predict() generic that is called on this very line.
pred_ols <- predict(GLMols, newdata = test, type = "response")
print("RMSE")
## [1] "RMSE"
sqrt(mean((test$Crash_Score - pred_ols)^2))
## [1] 4.285195
# Gamma GLM with log link: all predictors plus the
# Traffic_Control x Rd_Feature interaction.
GLMgamma <- glm(
  formula = Crash_Score ~ . + Traffic_Control:Rd_Feature,
  family = Gamma(link = "log"),
  data = train
)
summary(GLMgamma)
##
## Call:
## glm(formula = Crash_Score ~ . + Traffic_Control:Rd_Feature, family = Gamma(link = "log"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3179 -0.5544 -0.1435 0.2800 3.4186
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 16.4006471 6.8032872
## year -0.0071933 0.0033737
## Month2 0.0347164 0.0236225
## Month3 0.0159515 0.0238826
## Month4 0.0004506 0.0246954
## Month5 0.0004134 0.0234759
## Month6 -0.0194821 0.0241895
## Month7 -0.0162322 0.0246461
## Month8 -0.0089946 0.0237518
## Month9 -0.0008317 0.0241880
## Month10 0.0248085 0.0231181
## Month11 -0.0056385 0.0236839
## Month12 0.0510152 0.0233827
## Time_of_DayOVERNIGHT -0.0900418 0.0290688
## Time_of_DayLATE-EARLY -0.0244010 0.0198718
## Time_of_DayLATE-NIGHT -0.0141743 0.0190982
## Rd_FeatureINTERSECTION 0.0679312 0.0246778
## Rd_CharacterCURVE -0.0338970 0.0196827
## Rd_ClassOTHER -0.0819679 0.0123015
## Rd_ClassUS HWY 0.0180421 0.0201241
## Rd_ConfigurationONE-WAY -0.0215608 0.0216656
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN 0.0140325 0.0197109
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN -0.0047520 0.0128943
## Rd_ConfigurationUNKNOWN 0.0740870 0.0969596
## Rd_SurfaceOTHER -0.0423269 0.0247223
## LightDARK-LIT -0.0505747 0.0173505
## LightDARK-NOT-LIT -0.1049072 0.0306462
## LightDAWN -0.0263254 0.0664712
## LightDUSK -0.0266138 0.0318238
## LightOTHER -0.1190162 0.0532127
## Traffic_ControlSIGNAL-STOP 0.0460243 0.0157969
## Work_AreaYES 0.0499734 0.0424600
## WETorDRY -0.0017194 0.0061328
## Rd_FeatureINTERSECTION:Traffic_ControlSIGNAL-STOP -0.0278197 0.0298016
## t value Pr(>|t|)
## (Intercept) 2.411 0.01593 *
## year -2.132 0.03300 *
## Month2 1.470 0.14168
## Month3 0.668 0.50420
## Month4 0.018 0.98544
## Month5 0.018 0.98595
## Month6 -0.805 0.42060
## Month7 -0.659 0.51015
## Month8 -0.379 0.70492
## Month9 -0.034 0.97257
## Month10 1.073 0.28323
## Month11 -0.238 0.81183
## Month12 2.182 0.02914 *
## Time_of_DayOVERNIGHT -3.098 0.00195 **
## Time_of_DayLATE-EARLY -1.228 0.21949
## Time_of_DayLATE-NIGHT -0.742 0.45799
## Rd_FeatureINTERSECTION 2.753 0.00592 **
## Rd_CharacterCURVE -1.722 0.08506 .
## Rd_ClassOTHER -6.663 2.76e-11 ***
## Rd_ClassUS HWY 0.897 0.36998
## Rd_ConfigurationONE-WAY -0.995 0.31967
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN 0.712 0.47653
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN -0.369 0.71248
## Rd_ConfigurationUNKNOWN 0.764 0.44482
## Rd_SurfaceOTHER -1.712 0.08690 .
## LightDARK-LIT -2.915 0.00356 **
## LightDARK-NOT-LIT -3.423 0.00062 ***
## LightDAWN -0.396 0.69208
## LightDUSK -0.836 0.40301
## LightOTHER -2.237 0.02532 *
## Traffic_ControlSIGNAL-STOP 2.913 0.00358 **
## Work_AreaYES 1.177 0.23923
## WETorDRY -0.280 0.77920
## Rd_FeatureINTERSECTION:Traffic_ControlSIGNAL-STOP -0.933 0.35058
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.4182877)
##
## Null deviance: 7311.8 on 17353 degrees of freedom
## Residual deviance: 7204.0 on 17320 degrees of freedom
## AIC: 93235
##
## Number of Fisher Scoring iterations: 5
print("AIC")
## [1] "AIC"
AIC(GLMgamma)
## [1] 93235.01
# Test-set predictions get their own name; assigning them to `predict`
# shadowed the predict() generic that is called on this very line.
pred_gamma <- predict(GLMgamma, newdata = test, type = "response")
print("RMSE")
## [1] "RMSE"
sqrt(mean((test$Crash_Score - pred_gamma)^2))
## [1] 4.285252
# Gaussian GLM with log link on the same terms as the gamma model: all
# predictors plus the Traffic_Control x Rd_Feature interaction.
GLMnorm <- glm(
  formula = Crash_Score ~ . + Traffic_Control:Rd_Feature,
  family = gaussian(link = "log"),
  data = train
)
summary(GLMnorm)
##
## Call:
## glm(formula = Crash_Score ~ . + Traffic_Control:Rd_Feature, family = gaussian(link = "log"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.110 -2.966 -0.881 2.013 47.225
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 16.9230377 6.7748674
## year -0.0074520 0.0033596
## Month2 0.0346130 0.0234388
## Month3 0.0177666 0.0238523
## Month4 0.0004638 0.0248062
## Month5 0.0003880 0.0236406
## Month6 -0.0198460 0.0245978
## Month7 -0.0146273 0.0249741
## Month8 -0.0088130 0.0240056
## Month9 0.0021391 0.0242806
## Month10 0.0256225 0.0229781
## Month11 -0.0009390 0.0239097
## Month12 0.0488685 0.0230333
## Time_of_DayOVERNIGHT -0.0867608 0.0321548
## Time_of_DayLATE-EARLY -0.0239232 0.0199603
## Time_of_DayLATE-NIGHT -0.0154884 0.0195101
## Rd_FeatureINTERSECTION 0.0704499 0.0236039
## Rd_CharacterCURVE -0.0338879 0.0203990
## Rd_ClassOTHER -0.0832792 0.0123231
## Rd_ClassUS HWY 0.0190764 0.0194776
## Rd_ConfigurationONE-WAY -0.0244057 0.0227316
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN 0.0131801 0.0190852
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN -0.0032949 0.0125138
## Rd_ConfigurationUNKNOWN 0.0515541 0.0993109
## Rd_SurfaceOTHER -0.0391858 0.0255640
## LightDARK-LIT -0.0464936 0.0176972
## LightDARK-NOT-LIT -0.1063687 0.0333406
## LightDAWN -0.0259004 0.0669502
## LightDUSK -0.0294692 0.0318988
## LightOTHER -0.1330737 0.0637014
## Traffic_ControlSIGNAL-STOP 0.0418374 0.0155124
## Work_AreaYES 0.0438690 0.0401251
## WETorDRY 0.0001240 0.0060803
## Rd_FeatureINTERSECTION:Traffic_ControlSIGNAL-STOP -0.0306663 0.0284740
## t value Pr(>|t|)
## (Intercept) 2.498 0.01250 *
## year -2.218 0.02656 *
## Month2 1.477 0.13976
## Month3 0.745 0.45637
## Month4 0.019 0.98508
## Month5 0.016 0.98690
## Month6 -0.807 0.41978
## Month7 -0.586 0.55809
## Month8 -0.367 0.71353
## Month9 0.088 0.92980
## Month10 1.115 0.26483
## Month11 -0.039 0.96867
## Month12 2.122 0.03388 *
## Time_of_DayOVERNIGHT -2.698 0.00698 **
## Time_of_DayLATE-EARLY -1.199 0.23072
## Time_of_DayLATE-NIGHT -0.794 0.42728
## Rd_FeatureINTERSECTION 2.985 0.00284 **
## Rd_CharacterCURVE -1.661 0.09668 .
## Rd_ClassOTHER -6.758 1.44e-11 ***
## Rd_ClassUS HWY 0.979 0.32739
## Rd_ConfigurationONE-WAY -1.074 0.28300
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN 0.691 0.48983
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN -0.263 0.79232
## Rd_ConfigurationUNKNOWN 0.519 0.60368
## Rd_SurfaceOTHER -1.533 0.12533
## LightDARK-LIT -2.627 0.00862 **
## LightDARK-NOT-LIT -3.190 0.00142 **
## LightDAWN -0.387 0.69886
## LightDUSK -0.924 0.35559
## LightOTHER -2.089 0.03672 *
## Traffic_ControlSIGNAL-STOP 2.697 0.00700 **
## Work_AreaYES 1.093 0.27427
## WETorDRY 0.020 0.98373
## Rd_FeatureINTERSECTION:Traffic_ControlSIGNAL-STOP -1.077 0.28150
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 17.97553)
##
## Null deviance: 315875 on 17353 degrees of freedom
## Residual deviance: 311335 on 17320 degrees of freedom
## AIC: 99420
##
## Number of Fisher Scoring iterations: 6
print("AIC")
## [1] "AIC"
AIC(GLMnorm)
## [1] 99420.35
# Test-set predictions get their own name; assigning them to `predict`
# shadowed the predict() generic that is called on this very line.
pred_norm <- predict(GLMnorm, newdata = test, type = "response")
print("RMSE")
## [1] "RMSE"
sqrt(mean((test$Crash_Score - pred_norm)^2))
## [1] 4.285803
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Forward selection starting from the intercept-only gamma model, with the
# terms of GLMgamma as the upper bound. k = log(n) applies the BIC penalty.
GLMgamma1 <- glm(
  formula = Crash_Score ~ 1,
  family = Gamma(link = "log"),
  data = train
)
stepAIC(
  GLMgamma1,
  scope = list(upper = GLMgamma, lower = GLMgamma1),
  direction = "forward",
  k = log(nrow(train))
)
## Start: AIC=93452.1
## Crash_Score ~ 1
##
## Df Deviance AIC
## + Rd_Class 2 7260.8 93351
## + Traffic_Control 1 7272.3 93368
## + Rd_Feature 1 7274.3 93373
## + Rd_Configuration 4 7293.7 93448
## + Time_of_Day 3 7298.4 93450
## <none> 7311.8 93452
## + Rd_Character 1 7309.6 93457
## + year 1 7310.3 93458
## + Rd_Surface 1 7310.4 93459
## + Work_Area 1 7311.0 93460
## + Light 5 7295.0 93461
## + WETorDRY 1 7311.6 93461
## + Month 11 7305.3 93544
##
## Step: AIC=93341.77
## Crash_Score ~ Rd_Class
##
## Df Deviance AIC
## + Rd_Feature 1 7242.4 93308
## + Traffic_Control 1 7243.8 93311
## + Time_of_Day 3 7246.2 93336
## <none> 7260.8 93342
## + Rd_Character 1 7258.5 93346
## + Rd_Surface 1 7258.6 93346
## + year 1 7259.0 93347
## + Light 5 7243.6 93349
## + Work_Area 1 7260.3 93350
## + WETorDRY 1 7260.8 93352
## + Rd_Configuration 4 7259.1 93377
## + Month 11 7255.0 93435
##
## Step: AIC=93304.48
## Crash_Score ~ Rd_Class + Rd_Feature
##
## Df Deviance AIC
## + Time_of_Day 3 7228.4 93300
## + Traffic_Control 1 7238.2 93304
## <none> 7242.4 93304
## + Rd_Character 1 7240.5 93310
## + year 1 7240.9 93311
## + Rd_Surface 1 7241.1 93311
## + Work_Area 1 7241.8 93313
## + WETorDRY 1 7242.4 93314
## + Light 5 7226.1 93314
## + Rd_Configuration 4 7241.7 93342
## + Month 11 7236.8 93399
##
## Step: AIC=93297.94
## Crash_Score ~ Rd_Class + Rd_Feature + Time_of_Day
##
## Df Deviance AIC
## <none> 7228.4 93298
## + Traffic_Control 1 7224.5 93298
## + year 1 7226.7 93304
## + Rd_Character 1 7226.9 93304
## + Rd_Surface 1 7227.3 93305
## + Work_Area 1 7227.9 93307
## + WETorDRY 1 7228.4 93308
## + Light 5 7220.8 93328
## + Rd_Configuration 4 7227.6 93335
## + Month 11 7222.8 93392
##
## Call: glm(formula = Crash_Score ~ Rd_Class + Rd_Feature + Time_of_Day,
## family = Gamma(link = "log"), data = train)
##
## Coefficients:
## (Intercept) Rd_ClassOTHER Rd_ClassUS HWY
## 1.907945 -0.090945 0.003077
## Rd_FeatureINTERSECTION Time_of_DayOVERNIGHT Time_of_DayLATE-EARLY
## 0.075252 -0.136333 -0.032112
## Time_of_DayLATE-NIGHT
## -0.049669
##
## Degrees of Freedom: 17353 Total (i.e. Null); 17347 Residual
## Null Deviance: 7312
## Residual Deviance: 7228 AIC: 93240
# Reduced gamma GLM (log link): the three terms kept by the forward selection
# above, with Traffic_Control added as a fourth.
GLMgammaR <- glm(
  formula = Crash_Score ~ Rd_Class + Rd_Feature + Time_of_Day + Traffic_Control,
  family = Gamma(link = "log"),
  data = train
)
summary(GLMgammaR)
##
## Call:
## glm(formula = Crash_Score ~ Rd_Class + Rd_Feature + Time_of_Day +
## Traffic_Control, family = Gamma(link = "log"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3199 -0.5542 -0.1433 0.2784 3.3175
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.89608 0.01024 185.147 < 2e-16 ***
## Rd_ClassOTHER -0.08340 0.01134 -7.352 2.03e-13 ***
## Rd_ClassUS HWY 0.00646 0.01696 0.381 0.703231
## Rd_FeatureINTERSECTION 0.05082 0.01415 3.590 0.000331 ***
## Time_of_DayOVERNIGHT -0.13431 0.02678 -5.016 5.32e-07 ***
## Time_of_DayLATE-EARLY -0.03169 0.01945 -1.629 0.103333
## Time_of_DayLATE-NIGHT -0.05011 0.01643 -3.050 0.002293 **
## Traffic_ControlSIGNAL-STOP 0.04060 0.01337 3.037 0.002396 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.4186809)
##
## Null deviance: 7311.8 on 17353 degrees of freedom
## Residual deviance: 7224.5 on 17346 degrees of freedom
## AIC: 93236
##
## Number of Fisher Scoring iterations: 5
print("AIC")
## [1] "AIC"
AIC(GLMgammaR)
## [1] 93235.58
# Test-set predictions get their own name; assigning them to `predict`
# shadowed the predict() generic that is called on this very line.
pred_gamma_reduced <- predict(GLMgammaR, newdata = test, type = "response")
print("RMSE")
## [1] "RMSE"
sqrt(mean((test$Crash_Score - pred_gamma_reduced)^2))
## [1] 4.284678
# Standard diagnostic plots for the fitted model.
plot(GLMgammaR)
Task 8
# Refit the reduced gamma model on the full data set (`dat`) instead of the
# training partition.
GLMgammaRdat <- glm(
  formula = Crash_Score ~ Rd_Class + Rd_Feature + Time_of_Day + Traffic_Control,
  family = Gamma(link = "log"),
  data = dat
)
summary(GLMgammaRdat)
##
## Call:
## glm(formula = Crash_Score ~ Rd_Class + Rd_Feature + Time_of_Day +
## Traffic_Control, family = Gamma(link = "log"), data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3222 -0.5544 -0.1428 0.2770 3.3252
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.890687 0.008846 213.742 < 2e-16 ***
## Rd_ClassOTHER -0.081323 0.009822 -8.280 < 2e-16 ***
## Rd_ClassUS HWY 0.018101 0.014698 1.232 0.21813
## Rd_FeatureINTERSECTION 0.053271 0.012258 4.346 1.39e-05 ***
## Time_of_DayOVERNIGHT -0.117653 0.023295 -5.051 4.44e-07 ***
## Time_of_DayLATE-EARLY -0.050126 0.016803 -2.983 0.00286 **
## Time_of_DayLATE-NIGHT -0.044273 0.014174 -3.124 0.00179 **
## Traffic_ControlSIGNAL-STOP 0.049265 0.011571 4.257 2.08e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.4192278)
##
## Null deviance: 9740.7 on 23136 degrees of freedom
## Residual deviance: 9615.7 on 23129 degrees of freedom
## AIC: 124305
##
## Number of Fisher Scoring iterations: 5
Task 9
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
# alpha = 1 fit: choose lambda by cross-validation on the training design
# matrix, then refit at lambda.min. The seed is set before cv.glmnet().
set.seed(42)
X <- model.matrix(Crash_Score ~ . + Traffic_Control:Rd_Feature, data = train)
m <- cv.glmnet(
  x = X,
  y = train$Crash_Score,
  family = "gaussian",
  alpha = 1
)
plot(m)
m.best <- glmnet(
  x = X,
  y = train$Crash_Score,
  family = "gaussian",
  lambda = m$lambda.min,
  alpha = 1
)
# Design matrix for the hold-out rows, built from the same formula.
X.test <- model.matrix(Crash_Score ~ . + Traffic_Control:Rd_Feature, data = test)
m.best$beta
## 34 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) .
## year -0.035230776
## Month2 0.157206654
## Month3 0.046209463
## Month4 .
## Month5 .
## Month6 -0.072368785
## Month7 -0.041073747
## Month8 -0.006126929
## Month9 .
## Month10 0.109130441
## Month11 .
## Month12 0.263876244
## Time_of_DayOVERNIGHT -0.481383043
## Time_of_DayLATE-EARLY -0.090698959
## Time_of_DayLATE-NIGHT -0.070069854
## Rd_FeatureINTERSECTION 0.310714390
## Rd_CharacterCURVE -0.157376229
## Rd_ClassOTHER -0.526174010
## Rd_ClassUS HWY 0.021778137
## Rd_ConfigurationONE-WAY -0.063368334
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN 0.060092421
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN .
## Rd_ConfigurationUNKNOWN 0.044014946
## Rd_SurfaceOTHER -0.143226618
## LightDARK-LIT -0.270482724
## LightDARK-NOT-LIT -0.554771485
## LightDAWN .
## LightDUSK -0.058549755
## LightOTHER -0.559246300
## Traffic_ControlSIGNAL-STOP 0.224366735
## Work_AreaYES 0.174188162
## WETorDRY .
## Rd_FeatureINTERSECTION:Traffic_ControlSIGNAL-STOP .
# Root mean squared error of the refitted model on the test rows.
m.best.predict <- predict(m.best, newx = X.test)
squared_error <- (m.best.predict - test$Crash_Score)^2
rmse <- sqrt(sum(squared_error) / nrow(test))
rmse
## [1] 4.284406
# alpha = 0 fit: same procedure as the alpha = 1 run, with the seed reset so
# cv.glmnet() starts from the same random state.
set.seed(42)
X <- model.matrix(Crash_Score ~ . + Traffic_Control:Rd_Feature, data = train)
m <- cv.glmnet(
  x = X,
  y = train$Crash_Score,
  family = "gaussian",
  alpha = 0
)
plot(m)
m.best <- glmnet(
  x = X,
  y = train$Crash_Score,
  family = "gaussian",
  lambda = m$lambda.min,
  alpha = 0
)
# Design matrix for the hold-out rows, built from the same formula.
X.test <- model.matrix(Crash_Score ~ . + Traffic_Control:Rd_Feature, data = test)
m.best$beta
## 34 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) .
## year -0.03760358
## Month2 0.15344261
## Month3 0.06397486
## Month4 -0.01799748
## Month5 -0.01975586
## Month6 -0.12436708
## Month7 -0.10135990
## Month8 -0.06693177
## Month9 -0.01624658
## Month10 0.11120885
## Month11 -0.05227089
## Month12 0.23526357
## Time_of_DayOVERNIGHT -0.47502496
## Time_of_DayLATE-EARLY -0.12033859
## Time_of_DayLATE-NIGHT -0.11155756
## Rd_FeatureINTERSECTION 0.24403447
## Rd_CharacterCURVE -0.16884312
## Rd_ClassOTHER -0.41763915
## Rd_ClassUS HWY 0.10355680
## Rd_ConfigurationONE-WAY -0.11116457
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN 0.10725507
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN 0.03625030
## Rd_ConfigurationUNKNOWN 0.26568994
## Rd_SurfaceOTHER -0.20134860
## LightDARK-LIT -0.24001818
## LightDARK-NOT-LIT -0.50608543
## LightDAWN -0.13232862
## LightDUSK -0.12193123
## LightOTHER -0.60898119
## Traffic_ControlSIGNAL-STOP 0.21377149
## Work_AreaYES 0.26274006
## WETorDRY -0.00117400
## Rd_FeatureINTERSECTION:Traffic_ControlSIGNAL-STOP 0.08556195
# Root mean squared error of the refitted model on the test rows.
m.best.predict <- predict(m.best, newx = X.test)
residual_sq <- (m.best.predict - test$Crash_Score)^2
rmse <- sqrt(sum(residual_sq) / nrow(test))
rmse
## [1] 4.284918
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.